data <- read_csv("data.csv")
head(data)
## # A tibble: 6 Ă— 11
## Year State_Abbr State Binge_Drinking_Prevalence Legalized Bachelors_Rate
## <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2011 AL Alabama 13.7 0 22.3
## 2 2011 AK Alaska 20.8 1 26.4
## 3 2011 AZ Arizona 17.6 0 26.6
## 4 2011 AR Arkansas 14.1 0 20.3
## 5 2011 CA California 18.6 1 30.3
## 6 2011 CO Colorado 20.1 1 36.7
## # ℹ 5 more variables: Median_Age <dbl>, Urbanization_Rate <dbl>,
## # Legalization_Year <dbl>, G <dbl>, State_ID <dbl>
summary(data)
## Year State_Abbr State Binge_Drinking_Prevalence
## Min. :2011 Length:659 Length:659 Min. : 9.60
## 1st Qu.:2014 Class :character Class :character 1st Qu.:14.79
## Median :2017 Mode :character Mode :character Median :16.50
## Mean :2017 Mean :16.71
## 3rd Qu.:2020 3rd Qu.:18.30
## Max. :2023 Max. :27.20
##
## Legalized Bachelors_Rate Median_Age Urbanization_Rate
## Min. :0.0000 Min. :18.5 Min. :29.60 Min. :0.3117
## 1st Qu.:0.0000 1st Qu.:27.2 1st Qu.:37.05 1st Qu.:0.6161
## Median :0.0000 Median :30.9 Median :38.40 Median :0.7209
## Mean :0.1973 Mean :31.8 Mean :38.42 Mean :0.7224
## 3rd Qu.:0.0000 3rd Qu.:35.4 3rd Qu.:39.65 3rd Qu.:0.8551
## Max. :1.0000 Max. :65.9 Max. :45.10 Max. :1.0000
##
## Legalization_Year G State_ID
## Min. :2012 Min. : 0.0 Min. : 1.00
## 1st Qu.:2014 1st Qu.: 0.0 1st Qu.:13.00
## Median :2015 Median : 0.0 Median :26.00
## Mean :2015 Mean : 397.4 Mean :26.01
## 3rd Qu.:2016 3rd Qu.: 0.0 3rd Qu.:39.00
## Max. :2016 Max. :2016.0 Max. :51.00
## NA's :529
data <- data %>%
mutate(
Post = ifelse(!is.na(Legalization_Year) & Year >= Legalization_Year, 1, 0)
)
did_model <- feols(
Binge_Drinking_Prevalence ~ i(Post, Legalized, ref = 0) + Bachelors_Rate + Median_Age + Urbanization_Rate | State + Year,
data = data
)
summary(did_model)
## OLS estimation, Dep. Var.: Binge_Drinking_Prevalence
## Observations: 659
## Fixed-effects: State: 51, Year: 13
## Standard-errors: Clustered (State)
## Estimate Std. Error t value Pr(>|t|)
## Post::1:Legalized 0.135629 0.345874 0.392136 0.69662
## Bachelors_Rate 0.032055 0.109348 0.293149 0.77062
## Median_Age 0.059238 0.243106 0.243672 0.80848
## Urbanization_Rate 6.496083 22.503565 0.288669 0.77403
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 1.04364 Adj. R2: 0.86782
## Within R2: 0.002049
modelsummary(did_model,
stars = TRUE,
fmt = 3,
estimate = "{estimate} ({std.error})",
statistic = NULL,
title = "Difference-in-Differences Regression Results",
output = "markdown")
| (1) | |
|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
| Post = 1 Ă— Legalized | 0.136 (0.346) |
| Bachelors_Rate | 0.032 (0.109) |
| Median_Age | 0.059 (0.243) |
| Urbanization_Rate | 6.496 (22.504) |
| Num.Obs. | 659 |
| R2 | 0.881 |
| R2 Adj. | 0.868 |
| R2 Within | 0.002 |
| R2 Within Adj. | -0.005 |
| AIC | 2060.5 |
| BIC | 2361.3 |
| RMSE | 1.04 |
| Std.Errors | by: State |
| FE: State | X |
| FE: Year | X |
ggplot(data, aes(x = Year, y = Binge_Drinking_Prevalence, group = State, color = as.factor(Legalized))) +
geom_line(alpha = 0.5, size = 0.4) +
scale_color_manual(
values = c("0" = "darkgray", "1" = "darkgreen"),
labels = c("0" = "Not Legalized", "1" = "Legalized"),
name = "Legal Status"
) +
labs(
title = "Binge Drinking Prevalence Over Time by State",
subtitle = "Lines represent individual states; color indicates legalization status",
x = "Year",
y = "Binge Drinking Prevalence (%)"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Create average trends by treatment status
avg_trends <- data %>%
group_by(Year, Legalized) %>%
summarize(
avg_binge = mean(Binge_Drinking_Prevalence, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
Legalized = ifelse(Legalized == 1, "Treated (Legalized)", "Control (Not Legalized)")
)
# Plot
ggplot(avg_trends, aes(x = Year, y = avg_binge, color = Legalized)) +
geom_line(size = 1.1) +
geom_point(size = 2) +
scale_color_manual(
values = c("Treated (Legalized)" = "darkgreen", "Control (Not Legalized)" = "gray60")
) +
labs(
title = "Average Binge Drinking Prevalence Over Time",
subtitle = "Comparing Treated vs. Control States",
x = "Year",
y = "Average Binge Drinking Prevalence (%)",
color = "Group"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")
# Extract model estimates
did_df <- broom::tidy(did_model) %>%
filter(str_detect(term, "Post::1:Legalized")) %>%
mutate(
lower = estimate - 1.96 * std.error,
upper = estimate + 1.96 * std.error
)
# Plot
ggplot(did_df, aes(x = term, y = estimate)) +
geom_point(size = 3, color = "darkred") +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.15, color = "darkred") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray60") +
labs(
title = "Estimated DiD Treatment Effect",
subtitle = "Effect of Marijuana Legalization on Binge Drinking Prevalence",
x = "",
y = "Estimated Effect (percentage points)"
) +
theme_minimal(base_size = 13)
# Treated states with cohort year
treated_data <- data %>%
filter(!is.na(Legalization_Year)) %>%
mutate(Cohort = as.factor(Legalization_Year))
# Control group
control_data <- data %>%
filter(is.na(Legalization_Year)) %>%
mutate(Cohort = "Control")
# Combine both groups
combined_data <- bind_rows(treated_data, control_data)
# Average binge drinking by group and year
avg_trends <- combined_data %>%
group_by(Year, Cohort) %>%
summarise(Avg_Binge = mean(Binge_Drinking_Prevalence, na.rm = TRUE), .groups = "drop")
# Get legalization years per cohort (excluding control)
cohort_lines <- treated_data %>%
distinct(Cohort, Legalization_Year)
# Plot
ggplot(avg_trends, aes(x = Year, y = Avg_Binge, color = Cohort)) +
geom_line(size = 1.1) +
geom_point(size = 2) +
# Add vertical line for each treated cohort
geom_vline(data = cohort_lines, aes(xintercept = Legalization_Year, color = Cohort),
linetype = "dashed", show.legend = FALSE) +
labs(
title = "Binge Drinking Trends by Legalization Cohort and Control Group",
subtitle = "Dashed lines mark legalization years for each cohort",
x = "Year",
y = "Avg. Binge Drinking Prevalence (%)",
color = "Group"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")
# Group treated states by their cohort
cohort_states <- data %>%
filter(!is.na(Legalization_Year)) %>%
distinct(State, State_Abbr, Legalization_Year) %>%
mutate(Cohort = as.factor(Legalization_Year)) %>%
arrange(Cohort)
control_states <- data %>%
filter(is.na(Legalization_Year)) %>%
distinct(State, State_Abbr) %>%
mutate(Cohort = "Control") %>%
arrange(State)
# Combine treated and control states
state_cohorts <- bind_rows(cohort_states, control_states)
# View the result
print(state_cohorts)
## # A tibble: 51 Ă— 4
## State State_Abbr Legalization_Year Cohort
## <chr> <chr> <dbl> <chr>
## 1 Colorado CO 2012 2012
## 2 Washington WA 2012 2012
## 3 Alaska AK 2014 2014
## 4 District of Columbia DC 2014 2014
## 5 Oregon OR 2014 2014
## 6 California CA 2016 2016
## 7 Maine ME 2016 2016
## 8 Massachusetts MA 2016 2016
## 9 Nevada NV 2016 2016
## 10 Vermont VT 2016 2016
## # ℹ 41 more rows
#More detail on plot 5
# Label treated states with cohort (legalization year)
treated_data <- data %>%
filter(!is.na(Legalization_Year)) %>%
mutate(Cohort = as.factor(Legalization_Year))
# Create average line for control group
control_avg <- data %>%
filter(is.na(Legalization_Year)) %>%
group_by(Year) %>%
summarise(
Avg_Binge = mean(Binge_Drinking_Prevalence, na.rm = TRUE),
.groups = "drop"
)
# Plot
ggplot() +
# Plot individual treated states, color-coded by cohort
geom_line(data = treated_data,
aes(x = Year, y = Binge_Drinking_Prevalence, group = State, color = Cohort),
alpha = 0.7, size = 0.5) +
# Add control group average line
geom_line(data = control_avg,
aes(x = Year, y = Avg_Binge),
color = "grey60", linetype = "solid", size = 1) +
labs(
title = "Treated States Colored by Cohort; Control Group Averaged",
subtitle = "Each treated state shown individually; control group shown as single average line",
x = "Year",
y = "Binge Drinking Prevalence (%)",
color = "Legalization Cohort"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "bottom")
# Extract regression summary as a tidy data frame
library(broom)
# Convert model summary to a data frame
df <- tidy(did_model)
# Add confidence intervals
df <- df %>%
mutate(
lower = estimate - 1.96 * std.error,
upper = estimate + 1.96 * std.error
)
# Plot
ggplot(df, aes(x = term, y = estimate)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = lower, ymax = upper), width = 0.2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray40") +
coord_flip() +
labs(
title = "Coefficient Plot",
x = "Covariates",
y = "Estimate (with 95% CI)"
) +
theme_minimal(base_size = 14)
library(ggplot2)
ggplot(data, aes(x = Year, y = Binge_Drinking_Prevalence, group = State, color = as.factor(Legalized))) +
geom_line(alpha = 0.6) +
labs(
title = "Binge Drinking Prevalence Over Time by State",
x = "Year",
y = "Binge Drinking Prevalence (%)",
color = "Legalized"
) +
theme_minimal() +
theme(legend.position = "bottom")
data_aligned <- data %>%
filter(!is.na(Legalization_Year)) %>%
mutate(relative_year = Year - Legalization_Year)
ggplot(data_aligned, aes(x = relative_year, y = Binge_Drinking_Prevalence, group = State)) +
geom_line(alpha = 0.3, color = "gray") +
stat_summary(fun = mean, geom = "line", color = "orange", linewidth = .8) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Binge Drinking Trends Aligned to Legalization Year",
x = "Years Relative to Legalization",
y = "Binge Drinking Prevalence (%)"
) +
theme_minimal()
data_normalized <- data %>%
filter(!is.na(Legalization_Year)) %>%
mutate(relative_year = Year - Legalization_Year) %>%
group_by(State) %>%
mutate(
baseline = Binge_Drinking_Prevalence[relative_year == 0],
normalized_prevalence = Binge_Drinking_Prevalence - baseline
) %>%
ungroup()
ggplot(data_normalized, aes(x = relative_year, y = normalized_prevalence, group = State)) +
geom_line(alpha = 0.3, color = "gray") +
stat_summary(fun = mean, geom = "line", color = "orange", size = .5) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Normalized Binge Drinking Trends (Aligned to Legalization Year)",
x = "Years Relative to Legalization",
y = "Change in Binge Drinking Prevalence (%)"
) +
theme_minimal()
data_control <- data %>%
filter(Legalized == 0) %>%
mutate(relative_year = Year - 2016) %>%
group_by(State) %>%
mutate(
baseline = Binge_Drinking_Prevalence[Year == 2016],
normalized_prevalence = Binge_Drinking_Prevalence - baseline
) %>%
ungroup()
ggplot(data_control, aes(x = relative_year, y = normalized_prevalence, group = State)) +
geom_line(alpha = 0.3, color = "gray") +
stat_summary(fun = mean, geom = "line", color = "orange", size = .3) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Normalized Binge Drinking Trends (Control States Aligned at 2016)",
x = "Years Relative to 2016 (Placebo Treatment Year)",
y = "Change in Binge Drinking Prevalence (%)"
) +
theme_minimal()
# Add group label to each dataset
data_treated <- data %>%
filter(!is.na(Legalization_Year)) %>%
mutate(
relative_year = Year - Legalization_Year,
Group = "Treated"
) %>%
group_by(State) %>%
mutate(
baseline = Binge_Drinking_Prevalence[relative_year == 0],
normalized_prevalence = Binge_Drinking_Prevalence - baseline
) %>%
ungroup()
data_control <- data %>%
filter(Legalized == 0) %>%
mutate(
relative_year = Year - 2016,
Group = "Control"
) %>%
group_by(State) %>%
mutate(
baseline = Binge_Drinking_Prevalence[Year == 2016],
normalized_prevalence = Binge_Drinking_Prevalence - baseline
) %>%
ungroup()
# Combine both datasets
data_combined <- bind_rows(data_treated, data_control)
# Plot
ggplot(data_combined, aes(x = relative_year, y = normalized_prevalence, group = State)) +
geom_line(alpha = 0.15, color = "gray") +
stat_summary(aes(color = Group), fun = mean, geom = "line", size = 0.3) +
geom_vline(xintercept = 0, linetype = "dashed", color = "red") +
labs(
title = "Normalized Binge Drinking Trends: Treated vs. Control (Aligned by Year)",
x = "Years Relative to Treatment or 2016",
y = "Change in Binge Drinking Prevalence (%)",
color = "Group"
) +
theme_minimal() +
theme(legend.position = "bottom")
states_il <- c("IL", "NE", "KS", "IA", "AR", "OK")
did_data_il <- data %>%
filter(State_Abbr %in% states_il)
# Illinois legalized in 2020
did_data_il <- did_data_il %>%
mutate(
Treated = ifelse(State_Abbr == "IL", 1, 0),
Post = ifelse(Year >= 2020, 1, 0),
DiD = Treated * Post
)
did_model_il <- lm(
Binge_Drinking_Prevalence ~ DiD + Bachelors_Rate + Median_Age + Urbanization_Rate,
data = did_data_il
)
summary(did_model_il)
##
## Call:
## lm(formula = Binge_Drinking_Prevalence ~ DiD + Bachelors_Rate +
## Median_Age + Urbanization_Rate, data = did_data_il)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7383 -1.8711 -0.7476 2.4484 6.3249
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -44.5560 15.8670 -2.808 0.00639 **
## DiD -8.2494 1.7921 -4.603 1.72e-05 ***
## Bachelors_Rate 0.2303 0.1186 1.942 0.05595 .
## Median_Age 1.2310 0.4023 3.060 0.00309 **
## Urbanization_Rate 13.8685 5.7830 2.398 0.01904 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.8 on 73 degrees of freedom
## Multiple R-squared: 0.3843, Adjusted R-squared: 0.3506
## F-statistic: 11.39 on 4 and 73 DF, p-value: 3.084e-07
library(broom)
tidy(did_model_il) %>%
filter(term == "DiD") %>%
ggplot(aes(x = term, y = estimate)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = estimate - 1.96 * std.error,
ymax = estimate + 1.96 * std.error), width = 0.2) +
labs(
title = "DiD Estimate: Illinois vs. Border States",
x = "",
y = "Estimated Effect on Binge Drinking (%)"
) +
theme_minimal()
# Filter and label data
did_data_il <- data %>%
filter(State_Abbr %in% states_il) %>%
mutate(Group = ifelse(State_Abbr == "IL", "Illinois (Treated)", "Control States"))
# Summarize by year and group
avg_trend <- did_data_il %>%
group_by(Year, Group) %>%
summarise(mean_prevalence = mean(Binge_Drinking_Prevalence, na.rm = TRUE), .groups = "drop")
# Plot
ggplot(avg_trend, aes(x = Year, y = mean_prevalence, color = Group)) +
geom_line(size = 1.1) +
geom_vline(xintercept = 2020, linetype = "dashed", color = "red") +
labs(
title = "Binge Drinking Prevalence Over Time",
subtitle = "Illinois (Treated) vs. Border States (Controls)",
x = "Year",
y = "Avg. Binge Drinking Prevalence (%)",
color = "Group"
) +
theme_minimal(base_size = 14)
```